Restarted abm_dev (Python 3.9.19)
In [ ]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
r2_score,
mean_squared_error,
accuracy_score,
confusion_matrix,
classification_report,
)
from sklearn import linear_model
from sklearn import tree
from sklearn.cluster import KMeans
from pumf_2016_data import pumf_2016_util
# StandardScaler preferred over Normalizer, see note 1
# note 1: https://datascience.stackexchange.com/questions/45900/when-to-use-standard-scaler-and-when-normalizer
In [ ]:
df = pumf_2016_util.read_indiv_table()
pumf_2016_util.recode_indiv_table(df)
# list columns
print(list(df.columns))
df_2016 = df.copy()
del df
{'Female': 0, 'Male': 1}
{'No bedroom': 0.0, '1 bedroom': 1.0, '2 bedrooms': 2.0, '3 bedrooms': 3.0, '4 bedrooms': 4.0, '5 bedrooms or more': 5.0, 'Not available': 6.0}
['PPSORT', 'WEIGHT', 'WT1', 'WT2', 'WT3', 'WT4', 'WT5', 'WT6', 'WT7', 'WT8', 'WT9', 'WT10', 'WT11', 'WT12', 'WT13', 'WT14', 'WT15', 'WT16', 'ABOID', 'AGEGRP', 'AGEIMM', 'ATTSCH', 'BedRm', 'BFNMEMB', 'CapGn', 'CFInc', 'CFInc_AT', 'CfSize', 'CFSTAT', 'CHDBN', 'ChldC', 'CIP2011', 'CIP2011_STEM_SUM', 'Citizen', 'CitOth', 'CMA', 'CONDO', 'COW', 'CQPPB', 'DETH123', 'DIST', 'DPGRSUM', 'DTYPE', 'EFDecile', 'EfDIMBM', 'EFInc', 'EFInc_AT', 'EfSize', 'EICBN', 'EmpIn', 'ETHDER', 'FOL', 'FPTWK', 'GENSTAT', 'GovtI', 'GTRfs', 'HCORENEED_IND', 'HDGREE', 'HHInc', 'HHInc_AT', 'HHMRKINC', 'HHSIZE', 'HHTYPE', 'HLAEN', 'HLAFR', 'HLANO', 'HLBEN', 'HLBFR', 'HLBNO', 'IMMCAT5', 'IMMSTAT', 'IncTax', 'Invst', 'KOL', 'LFACT', 'LICO', 'LICO_AT', 'LOC_ST_RES', 'LOCSTUD', 'LoLIMA', 'LoLIMB', 'LoMBM', 'LSTWRK', 'LWAEN', 'LWAFR', 'LWANO', 'LWBEN', 'LWBFR', 'LWBNO', 'MarStH', 'MOB1', 'Mob5', 'MODE', 'MrkInc', 'MTNEn', 'MTNFr', 'MTNNO', 'NAICS', 'NOC16', 'NOCS', 'NOL', 'NOS', 'OASGI', 'OtInc', 'PKID0_1', 'PKID15_24', 'PKID2_5', 'PKID25', 'PKID6_14', 'PKIDS', 'POB', 'POBF', 'POBM', 'POWST', 'PR', 'PR1', 'PR5', 'PresMortG', 'PRIHM', 'PWDUR', 'PWLEAVE', 'PWOCC', 'PWPR', 'REGIND', 'REPAIR', 'Retir', 'ROOMS', 'SempI', 'Sex', 'SHELCO', 'SSGRAD', 'Subsidy', 'Tenur', 'TotInc', 'TotInc_AT', 'VALUE', 'VisMin', 'Wages', 'WKSWRK', 'WRKACT', 'YRIMM', 'AGE', 'AGEBIN', 'MOB1_cat', 'MOB5_cat', 'HHSIZE_int', 'NOCS_cat', 'NOCS_cat_combined', 'Sex_cat', 'BedRm_cat']
In [ ]:
# # %%
# preloaded_fea_file = "data/census_pumf_vancouver_2021.fea"
# data_path = "../data_packages/PUMF_Census_2021_StatCan/ind/data_donnees_2021_ind_v2.csv"
# if not os.path.exists(preloaded_fea_file):
# df = pd.read_csv(data_path)
# # import pyreadstat
# # df_2021, meta = pyreadstat.read_dta(data_path)
# # filter for CMA equals to Vancouver
# # source: https://www23.statcan.gc.ca/imdb/p3VD.pl?Function=getVD&TVD=317043&CVD=317046&CPV=59A&CST=01012016&CLV=3&MLV=5
# # df = df[df["CMA"] == "Vancouver"].reset_index(
# # drop=True
# # ) # filter by Vancouver CMA only
# # save to feather to allow faster loading later
# df.to_feather(preloaded_fea_file)
# else:
# df = pd.read_feather(preloaded_fea_file)
# # list columns
# print(list(df.columns))
# df_2021 = df.copy()
# del df
In [ ]:
df = df_2016
In [ ]:
# population by age
plot_df = pd.pivot_table(
df, values="WEIGHT", index="AGEBIN", columns=None, aggfunc="sum"
).reset_index()
plt.figure(figsize=(12, 6))
chart = sns.barplot(data=plot_df, x="AGEBIN", y="WEIGHT", palette="Paired")
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-5-4cc62d9ae302>:10: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. chart = sns.barplot(data=plot_df, x="AGEBIN", y="WEIGHT", palette="Paired") <ipython-input-5-4cc62d9ae302>:11: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
# population of age by household size
plt.figure(figsize=(8, 7))
plot_df = pd.pivot_table(
df, values="WEIGHT", index="AGEBIN", columns="HHSIZE", aggfunc="sum"
)
sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='AGEBIN'>
In [ ]:
# population of age by household size, Male
plt.figure(figsize=(8, 7))
plot_df = pd.pivot_table(
df[df["Sex"] == "Male"],
values="WEIGHT",
index="AGEBIN",
columns="HHSIZE",
aggfunc="sum",
)
sns.heatmap(plot_df, annot=False, cmap="Greens") # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='AGEBIN'>
In [ ]:
# population of age by household size, Female
plt.figure(figsize=(8, 7))
plot_df = pd.pivot_table(
df[df["Sex"] == "Female"],
values="WEIGHT",
index="AGEBIN",
columns="HHSIZE",
aggfunc="sum",
)
sns.heatmap(plot_df, annot=False, cmap="Purples") # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='AGEBIN'>
In [ ]:
# mobility status (moving) 5 year by household size
plot_df = pd.pivot_table(
df, values="WEIGHT", index="Mob5", columns="HHSIZE", aggfunc="sum"
)
sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='Mob5'>
In [ ]:
# mobility status (moving) 5 year by household size, exclude non movers
plot_df = pd.pivot_table(
df[~df["Mob5"].isin(["Non-migrants", "Non-movers", "Not applicable"])],
values="WEIGHT",
index="Mob5",
columns="HHSIZE",
aggfunc="sum",
observed=True,
)
sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='Mob5'>
In [ ]:
# total income by household size
df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
plot_df = pd.pivot_table(
df_processed,
values="WEIGHT",
index="HHSIZE",
columns="EmpIn_cat",
aggfunc="sum",
observed=True,
)
sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
<ipython-input-11-f7b35357bf58>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
Out[ ]:
<Axes: xlabel='EmpIn_cat', ylabel='HHSIZE'>
In [ ]:
# total income by household size, with children only
df_processed = df[(df["EmpIn"] < 88888888)]
df_processed = df_processed[(df_processed["PKIDS"] == "One or more")]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
plot_df = pd.pivot_table(
df_processed,
values="WEIGHT",
index="HHSIZE",
columns="EmpIn_cat",
aggfunc="sum",
observed=True,
)
sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
Out[ ]:
<Axes: xlabel='EmpIn_cat', ylabel='HHSIZE'>
In [ ]:
# employment income by hours of work, across sex
plt.figure(figsize=(20, 40))
df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
df_processed2 = df_processed[["Sex", "WKSWRK", "EmpIn_cat", "WEIGHT"]].reset_index()
plot_df = pd.pivot_table(
df_processed,
values="WEIGHT",
index=["Sex", "WKSWRK", "EmpIn_cat"],
columns=None,
aggfunc="sum",
observed=True,
).reset_index()
# sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
fg = sns.FacetGrid(plot_df, col="Sex", col_wrap=3, height=6, aspect=0.75)
def draw_heatmap(*args, **kwargs):
data = kwargs.pop("data")
d = data.pivot(index=args[1], columns=args[0], values=args[2])
sns.heatmap(d, **kwargs)
fg.map_dataframe(
draw_heatmap,
"EmpIn_cat",
"WKSWRK",
"WEIGHT",
cmap="Blues",
cbar=False,
square=False,
)
# fg.set(yticks=[])
# fg.set(xticks=[])
plt.show()
<ipython-input-13-3c371c939ffb>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
<Figure size 2000x4000 with 0 Axes>
In [ ]:
# employment income, by work activity, across sex
plt.figure(figsize=(20, 40))
df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
df_processed2 = df_processed[["Sex", "WRKACT", "EmpIn_cat", "WEIGHT"]].reset_index()
plot_df = pd.pivot_table(
df_processed,
values="WEIGHT",
index=["Sex", "WRKACT", "EmpIn_cat"],
columns=None,
aggfunc="sum",
observed=True,
).reset_index()
# sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
fg = sns.FacetGrid(plot_df, col="Sex", col_wrap=3, height=6, aspect=0.6)
def draw_heatmap(*args, **kwargs):
data = kwargs.pop("data")
d = data.pivot(index=args[1], columns=args[0], values=args[2])
sns.heatmap(d, **kwargs)
fg.map_dataframe(
draw_heatmap,
"EmpIn_cat",
"WRKACT",
"WEIGHT",
cmap="Blues",
cbar=False,
square=False,
)
# fg.set(yticks=[])
# fg.set(xticks=[])
plt.show()
<ipython-input-14-b3dc6934e354>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
<Figure size 2000x4000 with 0 Axes>
In [ ]:
# employment income, by employment occupations, across sex
plt.figure(figsize=(20, 40))
df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
df_processed2 = df_processed[["Sex", "NOCS", "EmpIn_cat", "WEIGHT"]].reset_index()
plot_df = pd.pivot_table(
df_processed,
values="WEIGHT",
index=["Sex", "NOCS", "EmpIn_cat"],
columns=None,
aggfunc="sum",
observed=True,
).reset_index()
# sns.heatmap(plot_df, annot=False, cmap="Blues") # cmap="flare")
fg = sns.FacetGrid(plot_df, col="Sex", col_wrap=3, height=6, aspect=0.8)
def draw_heatmap(*args, **kwargs):
data = kwargs.pop("data")
d = data.pivot(index=args[1], columns=args[0], values=args[2])
sns.heatmap(d, **kwargs)
fg.map_dataframe(
draw_heatmap,
"EmpIn_cat",
"NOCS",
"WEIGHT",
cmap="Blues",
cbar=False,
square=False,
)
# fg.set(yticks=[])
# fg.set(xticks=[])
plt.show()
<ipython-input-15-4732fe8487ea>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
<Figure size 2000x4000 with 0 Axes>
In [ ]:
# employment status across gender
df_processed = df.copy()
df_processed = df_processed[(df_processed["NOCS_cat"] != "Not")]
df_processed = df_processed[(df_processed["EmpIn"] < 88888888)]
df_processed = df_processed[(df_processed["EmpIn"] != 1)]
df_processed = df_processed[(df_processed["EmpIn"] != -1)]
df_processed = df_processed[(df_processed["EmpIn"] != 0)]
df_processed = df_processed[(df_processed["EmpIn"] > 0)]
df_processed = df_processed[(df_processed["EmpIn"] < 250000)]
df_processed = df_processed.sort_values(["NOCS_cat", "AGEBIN"], ascending=True)
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
data=df_processed,
x="NOCS",
y="EmpIn",
hue="Sex",
hue_order=["Male", "Female"],
split=True,
inner="quart",
cut=0,
palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-16-2fb64a6dc72b>:15: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. chart = sns.violinplot( <ipython-input-16-2fb64a6dc72b>:26: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.lineplot(
x="NOCS", y="EmpIn", hue="Sex", data=df_processed, errorbar=("ci", 95)
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-17-21eff3bcfbf8>:7: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
data=df_processed,
x="WKSWRK",
y="EmpIn",
hue="Sex",
hue_order=["Male", "Female"],
split=True,
inner="quart",
cut=0,
palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-18-89ab36c3c581>:4: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. chart = sns.violinplot( <ipython-input-18-89ab36c3c581>:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
data=df_processed,
x="WRKACT",
y="EmpIn",
hue="Sex",
hue_order=["Male", "Female"],
split=True,
inner="quart",
cut=0,
palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-19-a371957deac1>:4: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. chart = sns.violinplot( <ipython-input-19-a371957deac1>:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
data=df_processed[df_processed["HHSIZE"] != "Not available"],
x="HHSIZE",
y="EmpIn",
hue="Sex",
hue_order=["Male", "Female"],
split=True,
inner="quart",
cut=0,
palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-20-4a8355507187>:4: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. chart = sns.violinplot( <ipython-input-20-4a8355507187>:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.lineplot(
x="HHSIZE",
y="EmpIn",
hue="Sex",
data=df_processed[df_processed["HHSIZE"] != "Not available"], # style="event",
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-21-c28e057a1d4e>:11: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
df_processed2 = df_processed[df_processed["AGE"] != -1]
df_processed2["AGEBIN"] = df_processed2["AGEBIN"].astype(str)
df_processed2 = df_processed2.sort_values(["AGEBIN"], ascending=True)
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
data=df_processed2,
x="AGEBIN",
y="EmpIn",
hue="Sex",
hue_order=["Male", "Female"],
split=True,
inner="quart",
cut=0,
palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-22-092f4026b097>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed2["AGEBIN"] = df_processed2["AGEBIN"].astype(str) <ipython-input-22-092f4026b097>:7: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. chart = sns.violinplot( <ipython-input-22-092f4026b097>:18: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.lineplot(
x="AGEBIN", y="EmpIn", hue="Sex", data=df_processed2, errorbar=("ci", 95)
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-23-2a14bc8ed973>:7: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
In [ ]:
# Household size and age
df_processed3 = df_processed2[~(df_processed2["HHSIZE"] == "Not available")]
df_processed3["HHSIZE4"] = df_processed3["HHSIZE"].astype(str)
df_processed3["HHSIZE4"] = df_processed3["HHSIZE4"].map(
{
"1 person": "1 person",
"2 persons": "2 persons",
"3 persons": "3 persons",
"4 persons": "4+ persons",
"5 persons": "4+ persons",
"6 persons": "4+ persons",
"7 persons": "4+ persons",
}
)
df_processed3 = df_processed3.sort_values(["HHSIZE4", "AGEBIN"], ascending=True)
g = sns.relplot(
data=df_processed3,
x="AGEBIN",
y="EmpIn",
col="HHSIZE4",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
err_style="band",
col_wrap=4,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
<ipython-input-24-b00bf8fe8ab3>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed3["HHSIZE4"] = df_processed3["HHSIZE"].astype(str) <ipython-input-24-b00bf8fe8ab3>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed3["HHSIZE4"] = df_processed3["HHSIZE4"].map(
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f0ab7b80>
In [ ]:
# full time worker income by gender, across different number of weeks
df_processed3 = df_processed2[
(
~df_processed2["WRKACT"].isin(
[
"Not available",
"Not applicable",
"Didn't work in 2015, worked in 2016",
"Woked before 2015 or never worked",
]
)
)
& (df_processed2["WRKACT"].str.contains("full time"))
]
df_processed3["WRKACT"] = (
df_processed3["WRKACT"]
.astype(str)
.replace({"Worked 1 to 13 weeks full time": "Worked 01 to 13 weeks full time"})
)
df_processed3 = df_processed3.sort_values(["WRKACT", "AGEBIN"], ascending=True)
g = sns.relplot(
data=df_processed3,
x="AGEBIN",
y="EmpIn",
col="WRKACT",
# col="HHSIZE",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
err_style="band",
col_wrap=5,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
<ipython-input-25-660c8cd086a1>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed3["WRKACT"] = (
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7e65bbcd0>
In [ ]:
# part time worker income by gender, across different number of weeks
df_processed3 = df_processed2[
(
~df_processed2["WRKACT"].isin(
[
"Not available",
"Not applicable",
"Didn't work in 2015, worked in 2016",
"Woked before 2015 or never worked",
]
)
)
& (df_processed2["WRKACT"].str.contains("part time"))
]
df_processed3["WRKACT"] = (
df_processed3["WRKACT"]
.astype(str)
.replace({"Worked 1 to 13 weeks full time": "Worked 01 to 13 weeks full time"})
)
df_processed3 = df_processed3.sort_values(["WRKACT", "AGEBIN"], ascending=True)
g = sns.relplot(
data=df_processed3,
x="AGEBIN",
y="EmpIn",
col="WRKACT",
# col="HHSIZE",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
err_style="band",
col_wrap=5,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
<ipython-input-26-99aba2d0cb4b>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_processed3["WRKACT"] = (
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7e61f90d0>
In [ ]:
# Weeks worked across occupation – All workers
df_processed2 = df_processed2.sort_values(["NOCS_cat", "AGEBIN"], ascending=True)
g = sns.relplot(
data=df_processed2,
x="AGEBIN",
y="EmpIn",
col="NOCS_cat",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
col_wrap=5,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f2cb3160>
In [ ]:
# Weeks worked across occupation – Full time Full year
# df_processed3 = df_processed2[df_processed2["WRKACT"] != "Worked 49 to 52 weeks full time"]
df_processed3 = df_processed2.copy()
df_processed3 = df_processed3[
(
df_processed3["WRKACT"].isin(
["Worked 40 to 48 weeks full time", "Worked 49 to 52 weeks full time"]
)
)
]
g = sns.relplot(
data=df_processed3,
x="AGEBIN",
y="EmpIn",
col="NOCS_cat",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
col_wrap=5,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f0cd7b80>
In [ ]:
# Weeks worked across occupation – Part time or Part year
# df_processed3 = df_processed2[df_processed2["WRKACT"] != "Worked 49 to 52 weeks full time"]
df_processed3 = df_processed2.copy()
df_processed3 = df_processed3[
~(
df_processed3["WRKACT"].isin(
["Worked 40 to 48 weeks full time", "Worked 49 to 52 weeks full time"]
)
)
]
g = sns.relplot(
data=df_processed3,
x="AGEBIN",
y="EmpIn",
col="NOCS_cat",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
col_wrap=5,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f08c42b0>
In [ ]:
# Weeks worked across occupation – Part time or Part year - across weeks worked
# df_processed3 = df_processed2[df_processed2["WRKACT"] != "Worked 49 to 52 weeks full time"]
df_processed3 = df_processed2.copy()
df_processed3 = df_processed3[
~(
df_processed3["WRKACT"].isin(
["Worked 40 to 48 weeks full time", "Worked 49 to 52 weeks full time"]
)
)
]
df_processed3["WRKACT"] = df_processed3["WRKACT"].astype(str)
g = sns.relplot(
data=df_processed3,
x="NOCS_cat",
y="EmpIn",
col="WRKACT",
hue="Sex",
hue_order=["Male", "Female"],
# style="event",
kind="line",
col_wrap=5,
height=6,
aspect=0.8,
# palette="Set2",
errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) File c:\Users\bwen\OneDrive - TransLink\PROJECTS\census-pumf\pumf_analysis\analysis.py:29 12 df_processed3["WRKACT"] = df_processed3["WRKACT"].astype(str) 14 g = sns.relplot( 15 data=df_processed3, 16 x="NOCS_cat", (...) 27 errorbar=("ci", 95), 28 ) ---> 29 g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right") 30 g.add_legend() File c:\Users\bwen\.conda\envs\abm_dev\lib\site-packages\seaborn\axisgrid.py:929, in FacetGrid.set_xticklabels(self, labels, step, **kwargs) 927 ax.set_xticklabels(curr_labels, **kwargs) 928 else: --> 929 ax.set_xticklabels(labels, **kwargs) 930 return self File c:\Users\bwen\.conda\envs\abm_dev\lib\site-packages\matplotlib\axes\_base.py:74, in _axis_method_wrapper.__set_name__.<locals>.wrapper(self, *args, **kwargs) 73 def wrapper(self, *args, **kwargs): ---> 74 return get_method(self)(*args, **kwargs) File c:\Users\bwen\.conda\envs\abm_dev\lib\site-packages\matplotlib\axis.py:2062, in Axis.set_ticklabels(self, labels, minor, fontdict, **kwargs) 2058 elif isinstance(locator, mticker.FixedLocator): 2059 # Passing [] as a list of labels is often used as a way to 2060 # remove all tick labels, so only error for > 0 labels 2061 if len(locator.locs) != len(labels) and len(labels) != 0: -> 2062 raise ValueError( 2063 "The number of FixedLocator locations" 2064 f" ({len(locator.locs)}), usually from a call to" 2065 " set_ticks, does not match" 2066 f" the number of labels ({len(labels)}).") 2067 tickd = {loc: lab for loc, lab in zip(locator.locs, labels)} 2068 func = functools.partial(self._format_with_dict, tickd) ValueError: The number of FixedLocator locations (10), usually from a call to set_ticks, does not match the number of labels (13).
In [ ]:
# Full time part time may be a predictor for woman's income
#
Cell was canceled due to an error in a previous cell.
In [ ]:
# Analysis of wages
# LFACT == 1
df = df[(df.Wages < 88888888) & (df.LFACT == "Employed - Worked in reference week")]
df = df.reset_index(drop=True)
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.boxplot(data=df, x="Sex", y="Wages")
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.kdeplot(data=df, hue="Sex", x="Wages", bw_adjust=1.7)
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.scatterplot(data=df, x="Wages", y="AGE")
sns.rugplot(data=df, x="Wages", y="AGE")
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.kdeplot(data=df, x="Wages", hue="Sex")
sns.rugplot(data=df, x="Wages", hue="Sex")
Cell was canceled due to an error in a previous cell.
In [ ]:
df[df.Wages < 150000].plot.hexbin(x="Wages", y="AGE", gridsize=20)
Cell was canceled due to an error in a previous cell.
In [ ]:
wages_mobility = pd.pivot_table(
df, index="Mob5", columns="MOB1", values="Wages", aggfunc="mean"
)
sns.heatmap(wages_mobility, annot=False, cmap="crest", vmin=30000, vmax=100000)
Cell was canceled due to an error in a previous cell.
In [ ]:
print(list(df.columns))
Cell was canceled due to an error in a previous cell.
Regression Models
In [ ]:
# variable recoding
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe
# # try OneHotEncoder
# oht_encode = OneHotEncoder()
# oht_encode.fit(df['MOB5_cat'].unique().reshape(-1, 1))
# transformed = oht_encode.transform(
# df['MOB5_cat'].to_numpy().reshape(-1, 1)).toarray()
# ohe_df = pd.DataFrame(transformed, columns=oht_encode.get_feature_names_out())
# df = pd.concat([df, ohe_df], axis=1).drop(['MOB5_cat'], axis=1)
# do the same thing with get dummies
df = pd.get_dummies(df, prefix=["MOB5_cat"], columns=["MOB5_cat"], drop_first=False)
df = pd.get_dummies(df, prefix=["MOB1_cat"], columns=["MOB1_cat"], drop_first=False)
df = pd.get_dummies(df, prefix=["NOCS_cat"], columns=["NOCS_cat"], drop_first=False)
Cell was canceled due to an error in a previous cell.
In [ ]:
# variable selection
# y - Wages
# X
# - MOB1_cat: moved in the last year
# - MOB5_cat: moved in the last 5 years
# "HHSIZE_int", "Sex_cat", "BedRm_cat", "MOB5_cat_moved_to_canada_5", "MOB5_cat_moved_within_canada_5", "MOB5_cat_not_moved_5", "MOB1_cat_moved_to_canada_1", "MOB1_cat_moved_within_canada_1", "MOB1_cat_not_moved_1"
# A1. simple regression model
pred_var = "Wages"
variable_list = [
"AGE",
"Sex_cat",
"BedRm_cat",
"MOB5_cat_moved_to_canada_5",
"MOB5_cat_moved_within_canada_5",
# "MOB5_cat_not_moved_5",
# "MOB1_cat_moved_to_canada_1",
# "MOB1_cat_moved_within_canada_1",
"MOB1_cat_not_moved_1",
"NOCS_cat_Not",
"NOCS_cat_art_trades_manuf",
# "NOCS_cat_busi",
"NOCS_cat_health_edu_law",
"NOCS_cat_mgmt_sci",
"NOCS_cat_sales_agri",
]
X, y = df[variable_list], df[pred_var]
X = sm.add_constant(X)
variable_list.append("const")
model = sm.OLS(y, X).fit()
print(model.summary())
Cell was canceled due to an error in a previous cell.
In [ ]:
# A2. Simple regression model with formula (simple OLS)
# using formula is very convenient for specifying different models
# https://www.statsmodels.org/stable/api.html#statsmodels-formula-api
# print(' + '.join(variable_list))
results = smf.ols(
"Wages ~ np.log(AGE) + Sex_cat + BedRm_cat + MOB5_cat_moved_to_canada_5 + MOB5_cat_moved_within_canada_5 + MOB1_cat_not_moved_1 + NOCS_cat_Not + NOCS_cat_art_trades_manuf + NOCS_cat_health_edu_law + NOCS_cat_mgmt_sci + NOCS_cat_sales_agri",
data=df,
).fit()
print(results.summary())
Cell was canceled due to an error in a previous cell.
In [ ]:
# read: https://machinelearningresearch.quora.com/What-is-difference-between-ordinary-linear-regression-and-generalized-linear-model-or-linear-regression
# A3. Generalize Linear
results = smf.gls(
"Wages ~ np.log(AGE) + Sex_cat + BedRm_cat + MOB5_cat_moved_to_canada_5 + MOB5_cat_moved_within_canada_5 + MOB1_cat_not_moved_1 + NOCS_cat_Not + NOCS_cat_art_trades_manuf + NOCS_cat_health_edu_law + NOCS_cat_mgmt_sci + NOCS_cat_sales_agri",
data=df,
).fit()
print(results.summary())
Cell was canceled due to an error in a previous cell.
Model options from StatsModel¶
- OLS: Ordinary Least Square - https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html
- WLS: Weighted Least Square - https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.WLS.html#statsmodels.regression.linear_model.WLS
- GLS: Generalized Least Square - https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.GLS.html#statsmodels.regression.linear_model.GLS
- GLSAR: Generalized Least Squares with AR covariance structure - specify covariance structure
- MixedLM: Linear Mixed Effects - https://www.statsmodels.org/stable/generated/statsmodels.regression.mixed_linear_model.MixedLM.html#statsmodels.regression.mixed_linear_model.MixedLM
- GEE, Ordinal GEE, Nominal GEE: Estimating equations / structural equations
- RLM: Robust linear model - https://www.statsmodels.org/stable/generated/statsmodels.robust.robust_linear_model.RLM.html#statsmodels.robust.robust_linear_model.RLM Discrete Choices - https://www.statsmodels.org/stable/examples/notebooks/generated/discrete_choice_overview.html
- Logit: Logistic regression - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Logit.html#statsmodels.discrete.discrete_model.Logit
- Probit: Probit regression - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Probit.html#statsmodels.discrete.discrete_model.Probit
- MNLogit: Multinomial logit regression - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.MNLogit.html#statsmodels.discrete.discrete_model.MNLogit
- Poisson: Poisson discrete choice - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Poisson.html#statsmodels.discrete.discrete_model.Poisson
- Negative Binomial discrete choice - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.NegativeBinomial.html#statsmodels.discrete.discrete_model.NegativeBinomial Misc:
- Quantile Regression - https://www.statsmodels.org/stable/generated/statsmodels.regression.quantile_regression.QuantReg.html#statsmodels.regression.quantile_regression.QuantReg
- Hazard Regression - https://www.statsmodels.org/stable/generated/statsmodels.duration.hazard_regression.PHReg.html#statsmodels.duration.hazard_regression.PHReg
- GLM Additive Model - https://www.statsmodels.org/stable/generated/statsmodels.gam.generalized_additive_model.GLMGam.html#statsmodels.gam.generalized_additive_model.GLMGam feature selection
- Variance Threshold
In [ ]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
sel.fit(X)
print(sel.get_feature_names_out())
Cell was canceled due to an error in a previous cell.
In [ ]:
import math
# B1: ML Regression
# https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
# OLS Linear Regression using sklearn
reg = linear_model.LinearRegression()
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Ridge Regression
reg = linear_model.Ridge(alpha=0.5)
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Lasso Regression
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Elastic Net with Cross Grid
reg = linear_model.ElasticNetCV(cv=5, random_state=0)
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.alpha_)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
# # %%
# # Support Vector Machine for Regression
# from sklearn import svm
# reg = svm.SVR()
# reg.fit(X, y)
# y_pred = reg.predict(X)
# print(reg.alpha_)
# print(reg.intercept_, reg.coef_, reg.score(X, y))
# print(f"r2: {r2_score(y, y_pred)}")
# print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Regression Tree
# https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html
# normalize data
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
transformer = Normalizer().fit(X)
Xt = transformer.transform(X)
reg = tree.DecisionTreeRegressor(max_depth=6)
reg.fit(Xt, y)
y_pred = reg.predict(Xt)
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
# tree.plot_tree(reg)
# https://stackoverflow.com/questions/68352933/name-of-variables-in-sklearn-pipeline
r = tree.export_text(reg, feature_names=variable_list)
print(r)
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = RandomForestClassifier(n_estimators=5)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = AdaBoostClassifier(n_estimators=5)
scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f"scores: {scores}")
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = GradientBoostingRegressor(random_state=0)
scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f"scores: {scores}")
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# Histogram-based Gradient Boosting
# https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
from sklearn.ensemble import HistGradientBoostingRegressor
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = HistGradientBoostingRegressor(random_state=0)
scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f"scores: {scores}")
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
# pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# # Artificial Neural Network
# from sklearn.neural_network import MLPClassifier
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)
# reg = MLPClassifier(
# solver="lbfgs", alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1
# )
# scores = cross_val_score(reg, X_train, y_train, cv=5)
# print(f"scores: {scores}")
# reg.fit(X_train, y_train)
# y_pred = reg.predict(X_test)
# print(f"r2: {r2_score(y_test, y_pred)}")
# print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
# # ~25 mins
Cell was canceled due to an error in a previous cell.
Classification Models
In [ ]:
# C1: ML Classification
pred_var = "BedRm"
variable_list = [
"AGE",
"Wages",
"Sex_cat",
"MOB5_cat_moved_to_canada_5",
"MOB5_cat_moved_within_canada_5",
# "MOB5_cat_not_moved_5",
# "MOB1_cat_moved_to_canada_1",
# "MOB1_cat_moved_within_canada_1",
"MOB1_cat_not_moved_1",
"NOCS_cat_Not",
"NOCS_cat_art_trades_manuf",
# "NOCS_cat_busi",
"NOCS_cat_health_edu_law",
"NOCS_cat_mgmt_sci",
"NOCS_cat_sales_agri",
]
X, y = df[variable_list], df[pred_var]
y_classes = list(df[pred_var].unique())
Cell was canceled due to an error in a previous cell.
In [ ]:
# Classification Tree
# https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# normalize data
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
transformer = Normalizer().fit(X)
Xt = transformer.transform(X)
reg = tree.DecisionTreeClassifier(max_depth=6)
reg.fit(Xt, y)
y_pred = reg.predict(Xt)
print(f"score: {accuracy_score(y, y_pred)}")
print(classification_report(y, y_pred, target_names=y_classes))
# tree.plot_tree(reg)
# https://stackoverflow.com/questions/68352933/name-of-variables-in-sklearn-pipeline
r = tree.export_text(reg, feature_names=variable_list)
print(r)
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
# notes:
# Precision - TP / (TP + FP): fraction of true that is actually true
# Recall - TP / (TP + FN): fraction of true that was originally true
# f score - composite (avg) of precision and recall, 1 is perfect
Cell was canceled due to an error in a previous cell.
Clustering Models
In [ ]:
variable_list = [
"AGE",
"Wages",
"Sex_cat",
"MOB5_cat_moved_to_canada_5",
"MOB5_cat_moved_within_canada_5",
# "MOB5_cat_not_moved_5",
# "MOB1_cat_moved_to_canada_1",
# "MOB1_cat_moved_within_canada_1",
"MOB1_cat_not_moved_1",
"NOCS_cat_Not",
"NOCS_cat_art_trades_manuf",
# "NOCS_cat_busi",
"NOCS_cat_health_edu_law",
"NOCS_cat_mgmt_sci",
"NOCS_cat_sales_agri",
]
X = df[variable_list]
Cell was canceled due to an error in a previous cell.
In [ ]:
# D1: ML Clustering
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
kmeans = KMeans(n_clusters=6, random_state=0).fit(X)
kmeans.labels_
y = kmeans.predict(X)
cc = kmeans.cluster_centers_
# dict(zip(list(range(0,len(cc))), cc))
cc_dict = {("cluster" + str(i)): cc[i] for i in range(0, len(cc))}
view_dict = {"name": variable_list}
view_dict.update(cc_dict)
pd.DataFrame(view_dict)
Cell was canceled due to an error in a previous cell.
In [ ]:
df["cluster"] = y
df.groupby("cluster").sum()["WEIGHT"].round(0) / df.sum()["WEIGHT"] * 100
Cell was canceled due to an error in a previous cell.
In [ ]:
# sns.scatterplot(data=df.sample(100), x="Wages", y="AGE", hue="cluster", size=10)
sns.displot(
data=df.sample(100), x="Wages", y="AGE", hue="cluster", kind="kde", rug=True
)
# sns.displot(
# data=df.sample(100), x="Wages", y="AGE", hue="cluster", kind="hist", rug=True
# )
pivot_df = df.pivot_table(index="cluster", columns="DPGRSUM", aggfunc="sum")["WEIGHT"]
pivot_df / pivot_df.sum() * 100
Cell was canceled due to an error in a previous cell.